# Statistical Modeling Functions
# Alex F. Gazmararian
# agazmararian@gmail.com

library(here)

# Global vcov cache for building during full mode runs
if (!exists("VCOV_CACHE")) {
  VCOV_CACHE <- list()
}

#' Estimate regression model with flexible specifications
#' 
#' Supports dual-mode replication:
#' - In "full" mode: computes Conley SEs and caches vcov matrices
#' - In "anonymized" mode: uses pre-cached vcov matrices (no coordinates needed)
#' 
#' @param varname Main explanatory variable name
#' @param covar.list.in List of control variables (defaults to global covar.list if available)
#' @param hetvar Heterogeneity variable name for interactions (default: "")
#' @param cutoff Distance cutoff for Conley standard errors (default: 50)
#' @param outcome Outcome variable name (default: "greenproj_bin")
#' @param data.in Input data frame (required)
#' @param se Standard error type: "conley", "conley_spatial", "state", "hc1" (default: "conley")
#' @param conley.distance Distance function for Conley SEs (default: "triangular")
#' @param conley.pixel Pixel size for Conley SEs (default: 0)
#' @param fe Fixed effects specification (default: "| state")
#' @param weights Survey weights (optional)
#' @param model_name Optional model name for vcov caching (auto-generated if NULL)
#' @param cache_vcov Whether to cache vcov matrix in full mode (default: TRUE for Conley SEs)
#' @return Model results
est_model <- function(varname, covar.list.in = NULL, hetvar = "", cutoff = 50, outcome = "greenproj_bin", 
                      data.in, se = "conley", conley.distance = "triangular", conley.pixel = 0, 
                      fe = "| state", weights = NULL, model_name = NULL, cache_vcov = TRUE) {
  
  # Use global covar.list if covar.list.in not provided and it exists
  if (is.null(covar.list.in) && exists("covar.list", envir = .GlobalEnv)) {
    covar.list.in <- get("covar.list", envir = .GlobalEnv)
  } else if (is.null(covar.list.in)) {
    stop("covar.list.in must be provided or global covar.list must exist")
  }
  
  # Construct formula
  if (hetvar != "") {
    # Interaction model - use sophisticated approach from visibility version
    rhs <- paste0(varname, " * ", hetvar, " + ", paste(covar.list.in, collapse = " + "))
  } else {
    # Main effects model - use simple approach from config version
    rhs <- paste0(varname, " + ", paste(covar.list.in, collapse = " + "))
  }
  
  formula_str <- paste(outcome, "~", rhs, fe)
  model_formula <- as.formula(formula_str)
  
  # Generate model name for caching if not provided
  if (is.null(model_name)) {
    weight_suffix <- if (!is.null(weights)) "_wt" else ""
    model_name <- paste0(outcome, "_", varname, "_", se, "_", cutoff, weight_suffix)
  }
  
  # Check replication mode
  replication_mode <- if (exists("REPLICATION_MODE", envir = .GlobalEnv)) {
    get("REPLICATION_MODE", envir = .GlobalEnv)
  } else {
    "full"  # Default to full mode if not set
  }
  
  # Estimate model with different standard error types
  if (se == "conley") {
    # Handle Conley SEs differently based on replication mode
    if (replication_mode == "anonymized") {
      # Anonymized mode: use cached vcov matrix
      vcov_cache_file <- here("data", "cache", "vcov_conley_cache.rds")
      
      if (!file.exists(vcov_cache_file)) {
        stop("Anonymized mode requires cached vcov matrices. File not found: ", vcov_cache_file)
      }
      
      vcov_cache <- readRDS(vcov_cache_file)
      
      if (!model_name %in% names(vcov_cache)) {
        stop("Model '", model_name, "' not found in vcov cache. ",
             "This may indicate the cache needs to be regenerated in full mode.")
      }
      
      # Estimate model without spatial vcov (will replace later)
      model <- fixest::feols(
        fml = model_formula,
        data = data.in,
        weights = weights,
        vcov = "hetero"  # Placeholder - we'll replace with cached vcov
      )
      
      # Replace vcov with cached version
      cached_vcov <- vcov_cache[[model_name]]
      
      # Validate dimensions match - STRICT: abort if mismatch
      if (nrow(cached_vcov) != length(coef(model))) {
        stop("Cached vcov dimensions (", nrow(cached_vcov), "x", ncol(cached_vcov), 
             ") don't match model coefficients (", length(coef(model)), ") for: ", model_name,
             ".\n  This indicates the vcov cache was generated with a different model specification.",
             "\n  Please regenerate the cache by running the analysis in full mode.",
             "\n  Cache file: ", vcov_cache_file)
      } else {
        # Apply cached vcov properly using summary() to maintain model consistency
        # This ensures modelsummary and broom::tidy work correctly
        model <- summary(model, vcov = cached_vcov)
        attr(model, "vcov_type") <- "Conley (cached)"
      }
      
    } else {
      # Full mode: compute fresh Conley SEs
      if (!all(c("lat_zip", "lon_zip") %in% names(data.in))) {
        stop("lat_zip and lon_zip columns required for spatial Conley standard errors")
      }
      
      model <- fixest::feols(
        fml = model_formula,
        data = data.in,
        weights = weights,
        vcov = fixest::vcov_conley(cutoff = cutoff, lat = "lat_zip", lon = "lon_zip", 
                                  distance = conley.distance, pixel = conley.pixel)
      )
      
      # Cache vcov matrix for future anonymized runs
      if (cache_vcov) {
        VCOV_CACHE[[model_name]] <<- vcov(model)
      }
    }
  } else if (se == "state") {
    # State-clustered standard errors
    model <- fixest::feols(
      fml = model_formula,
      data = data.in,
      weights = weights,
      cluster = "state"
    )
  } else if (se == "hc1") {
    # Heteroskedasticity-robust standard errors
    model <- fixest::feols(
      fml = model_formula,
      data = data.in,
      weights = weights,
      vcov = "hetero"
    )
  } else {
    # General case - pass se directly to vcov (from visibility version)
    model <- fixest::feols(
      fml = model_formula,
      data = data.in,
      weights = weights,
      vcov = se
    )
  }
  
  return(model)
}

#' Save accumulated vcov cache to disk
#' Call this at the end of analysis scripts in full mode
#' @param cache_file Path to cache file
#' @export
save_vcov_cache <- function(cache_file = here("data", "cache", "vcov_conley_cache.rds")) {
  if (length(VCOV_CACHE) > 0) {
    # Merge with existing cache if present, with new entries taking precedence
    if (file.exists(cache_file)) {
      existing <- readRDS(cache_file)
      # Remove entries from existing that are in VCOV_CACHE (new entries take precedence)
      existing <- existing[!names(existing) %in% names(VCOV_CACHE)]
      VCOV_CACHE <- c(VCOV_CACHE, existing)
    }
    
    dir.create(dirname(cache_file), recursive = TRUE, showWarnings = FALSE)
    saveRDS(VCOV_CACHE, cache_file)
    message("Saved ", length(VCOV_CACHE), " vcov matrices to cache: ", cache_file)
  }
}

#' Clear the in-memory vcov cache
#' @export
clear_vcov_cache <- function() {
  VCOV_CACHE <<- list()
  message("Cleared vcov cache")
}
